/* --------------------------------------------------------------  */
/* (C)Copyright 2001,2007,                                         */
/* International Business Machines Corporation,                    */
/* Sony Computer Entertainment, Incorporated,                      */
/* Toshiba Corporation,                                            */
/*                                                                 */
/* All Rights Reserved.                                            */
/* --------------------------------------------------------------  */
/* PROLOG END TAG zYx                                              */
#include "../common.h"

#include <libspe2.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/wait.h>
#include <string.h>
#include <math.h>
#include <stdbool.h>

#include <sys/time.h>


#define sign(n) ((n) < 0.0 ? -1.0 : ((n) > 0.0 ? 1.0 : 0.0) )
#define min(a,b) ((a) < (b) ? a : b)

struct timezone tzone;
struct timeval time0, time1;
int sec, usec;

void print_matrix(struct sparse_float_matrix* matrix)
{
	size_t i, j;

	printf("matrix[%d][%d]:\n", matrix->uiRows, matrix->uiCols);

	for (i = 0; i < matrix->uiRows; i++)
	{
		printf("matrix[%d] has %d nonzero columns\n", i, matrix->rows[i].uiCols);

		for (j = 0; j < matrix->rows[i].uiCols; j++)
		{
			printf("matrix[%d][%d] = %f\n", i, matrix->rows[i].elements[j].uiCol,
				matrix->rows[i].elements[j].data);
		} 
	}
}

struct complex_float jacobi(float a, float b, float g)
{
	struct complex_float x;

	float w = (b - a) / (2.0 * g);

	if (w == 0)
	{
		puts("w equals zero!!!");
	}

	float t = sign(w) / (fabs(w) + sqrt(1.0 + w*w));
	x.real = 1.0 / sqrt(1.0 + t*t);
	x.imag = t * x.real;

	debug2("t = %f / (%f + sqrt(1.0 + %f*%f)) = %f\n", sign(w), fabs(w), w, w, t);

	return x;
}

float sparse_inner_product(struct sparse_float_matrix_row row1, struct sparse_float_matrix_row row2)
{
	size_t idx1 = 0, idx2 = 0;
	bool finished = false;
	float innerprod = 0.0;

	while (!finished)
	{
		if (row1.elements[idx1].uiCol < row2.elements[idx2].uiCol)
		{
			idx1++;
		}
		else if (row1.elements[idx1].uiCol > row2.elements[idx2].uiCol)
		{
			idx2++;
		}
		else
		{
			innerprod += row1.elements[idx1].data * row2.elements[idx2].data;
			idx1++;
			idx2++;
		}

		if (idx1 >= row1.uiCols) finished = true;
		if (idx2 >= row2.uiCols) finished = true;
	}

	return innerprod;
}

bool hestenes_jacobi_calculation(struct sparse_float_matrix* matrix, size_t lb_i, size_t ub_i, 
	size_t lb_j, size_t ub_j, float G_matrix[R][R], struct complex_float P_matrix[R][R], float delta)
{
	debug1("hestenes_jacobi_calculation [%d, %d] x [%d, %d]\n", lb_i, ub_i, lb_j, ub_j);

	bool converged = true;
	bool finished = false;
	size_t i, j;
	size_t num_rows_i = ub_i - lb_i + 1;
	size_t num_rows_j = ub_j - lb_j + 1;
	size_t idx_i, idx_j, col_index_i, col_index_j;
	float a, b, g, data_i, data_j;


	// Matrix of indicies, to mark our progress between pairs of rows from block i and block j
	struct index_pair J_matrix[R][R];

	// Vectors of norms for row block i and row block j
	float theta_i[R];
	float theta_j[R];

	// TBD: Use memset on each matrix/vector to initialize
	for (i = 0; i < R; i++)
	{
		for (j = 0; j < R; j++)
		{
			J_matrix[i][j].idx_left = 0;
			J_matrix[i][j].idx_right = 0;

			G_matrix[i][j] = 0.0;

			P_matrix[i][j].real = 0.0;
			P_matrix[i][j].imag = 0.0;
		}
	}

	for (i = 0; i < R; i++)
	{
		theta_i[i] = 0.0;
	}

	for (j = 0; j < R; j++)
	{
		theta_j[j] = 0.0;
	}

	debug1("Calculation: Finished initializing stuff\n");

	// Calculate dotproducts of all pairs from block i and block j
	// J_matrix keeps track of the state of each pairwise computation
	// TBD: Is this going to make local store management difficult? (since we might need 
	// many pieces of a row at once)
	do
	{
		finished = true;
		for (i = lb_i; i <= ub_i; i++)
		{
			for (j = lb_j; j <= ub_j; j++)
			{
				if (i >= j)
				{
					continue;
				}

				idx_i = J_matrix[i % R][j % R].idx_left;
				idx_j = J_matrix[i % R][j % R].idx_right;

				// Only skip this iteration if we're past the end of *both* vectors
				if (idx_i >= matrix->rows[i].uiCols && idx_j >= matrix->rows[j].uiCols)
				{
					continue;
				}
				finished = false;

				data_i = matrix->rows[i].elements[idx_i].data;
				data_j = matrix->rows[j].elements[idx_j].data;
				col_index_i = matrix->rows[i].elements[idx_i].uiCol;
				col_index_j = matrix->rows[j].elements[idx_j].uiCol;

				if (col_index_i < col_index_j)
				{
					theta_i[i % R] += data_i * data_i;
					idx_i++;
				}
				else if (col_index_i > col_index_j)
				{
					theta_j[j % R] += data_j * data_j;
					idx_j++;
				}
				else
				{
					G_matrix[i % R][j % R] += data_i * data_j;

					theta_i[i % R] += data_i * data_i;
					theta_j[j % R] += data_j * data_j;
					idx_i++;
					idx_j++;
				}

				J_matrix[i % R][j % R].idx_left = idx_i;
				J_matrix[i % R][j % R].idx_right = idx_j;
			}
		}
	} while (!finished);

	debug1("Calculation: Finished calculating theta vectors and P matrix\n");

	// In the loop above, we may have multi-counted row norms stored in the theta vectors.
	// This will happen if the two row blocks have different sets of vectors (which is 
	// the most frequest case)
	//
	// Each pair that a row has particpated in adds a multple of the norm to theta
	// We now divide each norm by that number of pairs to obtain the correct norm

	if (lb_i != lb_j && ub_i != ub_j)
	{
		for (i = lb_i; i <= ub_i; i++)
		{
			debug3("Dividing %f by %d\n", theta_i[i % R], num_rows_j);
			theta_i[i % R] = theta_i[i % R] / num_rows_j;
		}

		for (j = lb_j; j <= ub_j; j++)
		{
			debug3("Dividing %f by %d\n", theta_j[j % R], num_rows_i);
			theta_j[j % R] = theta_j[j % R] / num_rows_i;
		}
	}

	// Now the norms and dotproducts should be calculated, so we can calculate the jacobi values
	for (i = lb_i; i <= ub_i; i++)
	{
		for (j = lb_j; j <= ub_j; j++)
		{
			if (i >= j)
			{
				continue;
			}

			a = theta_i[i % R];
			b = theta_j[j % R];
			g = G_matrix[i % R][j % R];

			debug2("a = norm(row_block[i, %d]) = %f\n", i, a);
			debug2("b = norm(row_block[j, %d]) = %f\n", j, b);
			debug2("g = dotprod(row_block[i, %d], row_block[j, %d]) = %f\n", i, j, g);

			if (fabs(g) > delta)
			{
				converged = false;
			}

			if (fabs(g) > EPSILON)
			{
				P_matrix[i % R][j % R] = jacobi(a, b, g);
			}
			else
			{
				P_matrix[i % R][j % R].real = 1.0;
				P_matrix[i % R][j % R].imag = 0.0;
			}

			debug2("P for row pair(%d, %d) = (%f, %f)\n", i, j, 
				P_matrix[i % R][j % R].real, P_matrix[i % R][j % R].imag);
		}
	}

	return converged;
}

void hestenes_jacobi_application(struct sparse_float_matrix* matrix, size_t lb_i, size_t ub_i, 
	size_t lb_j, size_t ub_j, struct complex_float P_matrix[R][R])
{
	debug1("hestenes_jacobi_application [%d, %d] x [%d, %d]\n", lb_i, ub_i, lb_j, ub_j);

	bool finished = false;
	size_t i, j;
	size_t idx_i, idx_j;
	float data_i, data_j;

	// Matrix of indicies, to mark our progress between pairs of rows from block i and block j
	struct index_pair J_matrix[R][R];

	// TBD: Use memset on each matrix/vector to initialize
	for (i = 0; i < R; i++)
	{
		for (j = 0; j < R; j++)
		{
			J_matrix[i][j].idx_left = 0;
			J_matrix[i][j].idx_right = 0;
		}
	}

	do
	{
		finished = true;
		for (i = lb_i; i <= ub_i; i++)
		{
			for (j = lb_j; j <= ub_j; j++)
			{
				if (i >= j)
				{
					continue;
				}

				idx_i = J_matrix[i % R][j % R].idx_left;
				idx_j = J_matrix[i % R][j % R].idx_right;

				// If we're past the end of either row, the application will have no effect,
				// so it's safe to skip
				if (idx_i >= matrix->rows[i].uiCols || idx_j >= matrix->rows[j].uiCols)
				{
					continue;
				}
				finished = false;

				if (matrix->rows[i].elements[idx_i].uiCol < matrix->rows[j].elements[idx_j].uiCol)
				{
					idx_i++;
				}
				else if (matrix->rows[i].elements[idx_i].uiCol > matrix->rows[j].elements[idx_j].uiCol)
				{
					idx_j++;
				}
				else
				{
					data_i = matrix->rows[i].elements[idx_i].data;
					data_j = matrix->rows[j].elements[idx_j].data;
					struct complex_float x = P_matrix[i % R][j % R];

					matrix->rows[i].elements[idx_i].data = x.real * data_i - x.imag * data_j;
					matrix->rows[j].elements[idx_j].data = x.imag * data_i + x.real * data_j;

					debug2("app rows %d and %d\n", i, j);
					debug2("matrix->rows[%d].elements[%d].data <- %f * %f - %f * %f = %f\n", 
						i, idx_i, x.real, data_i, x.imag, data_j, 
						matrix->rows[i].elements[idx_i].data);

					debug2("matrix->rows[%d].elements[%d].data <- %f * %f - %f * %f = %f\n", 
						j, idx_j, x.imag, data_i, x.real, data_j, 
						matrix->rows[j].elements[idx_j].data);
					debug2("\n");

					idx_i++;
					idx_j++;
				}

				J_matrix[i % R][j % R].idx_left = idx_i;
				J_matrix[i % R][j % R].idx_right = idx_j;
			}
		}
	} while (!finished);
}

bool process_work_block(struct sparse_float_matrix* matrix, size_t lb_i, size_t ub_i, 
	size_t lb_j, size_t ub_j, float delta)
{
	debug1("Processing work block [%d, %d] x [%d, %d]\n", lb_i, ub_i, lb_j, ub_j);

	bool converged = true;

	// TBD: May want to reuse these rather than create them for each job
	// G is a matrix of dotproducts, P is a matrix of jacobi calculations
	// These are stored between pair of rows from block i and block j
	float G_matrix[R][R];
	struct complex_float P_matrix[R][R];

	// Perform the HJ calculation, then the HJ application
	converged &= hestenes_jacobi_calculation(matrix, lb_i, ub_i, lb_j, ub_j, G_matrix, P_matrix, delta);
	hestenes_jacobi_application(matrix, lb_i, ub_i, lb_j, ub_j, P_matrix);

	return converged;
}

void do_svd(struct sparse_float_matrix* matrix, size_t row_block_size)
{
	size_t i, diagonal, row_idx;
	float sumdotprods = 0.0, dotprod;
	bool converged = false;
	size_t lb_i, ub_i, lb_j, ub_j;
	float sigma[matrix->uiRows];
	size_t iterations = 0;

	for (i = 0; i < matrix->uiRows; i++)
	{
		dotprod = sparse_inner_product(matrix->rows[i], matrix->rows[i]);
		debug1("row[%d] dot row[%d] = %f\n", i, i, dotprod); fflush(stdout);
		sumdotprods += dotprod;
	}

	float delta = EPSILON * sumdotprods;
	debug1("delta = %f\n", delta); fflush(stdout);

	size_t num_blocks = (size_t) ceil(matrix->uiRows * 1.0 / row_block_size);

	do
	{
		iterations++;
		debug1("Starting iteration = %d\n", iterations); fflush(stdout);

		converged = true;
		for (diagonal = 0; diagonal < num_blocks; diagonal++)
		{
			debug1("Processing diagonal %d of the work matrix\n", diagonal);
			for (row_idx = 0; row_idx < num_blocks - diagonal; row_idx++)
			{
				lb_i = row_idx * row_block_size;
				ub_i = (size_t) min(lb_i + row_block_size - 1, matrix->uiRows - 1);
				lb_j = (row_idx + diagonal) * row_block_size;
				ub_j = (size_t) min(lb_j + row_block_size - 1, matrix->uiRows - 1);
				converged = process_work_block(matrix, lb_i, ub_i, lb_j, ub_j, delta);
			}
		}

	} while (converged != true);

	// Now find the sigmas
	// TBD: Should we return this to the caller? Or write it to disk? Should we return anything else?
	for (i = 0; i < matrix->uiRows; i++)
	{
		sigma[i] = sqrt( sparse_inner_product(matrix->rows[i], matrix->rows[i]) );
		printf("sigma[%d] = %f\n", i, sigma[i]); fflush(stdout);
	}

}

int main(void)
{
	size_t M = 10, N = 3;
	size_t i = 0, j;

	// 3.0000    4.0000    1.0000
	// 8.0000    1.0000    9.0000
	// 8.0000    7.0000    7.0000
	// 2.0000    2.0000    2.0000
	// 4.0000    6.0000    2.0000
	// 9.0000    1.0000    6.0000
	// 9.0000    6.0000    9.0000
	// 2.0000    4.0000    8.0000
	// 8.0000    6.0000    3.0000
	// 8.0000    2.0000    5.0000

	struct sparse_float_matrix matrix;
	matrix.uiRows = M;
	matrix.uiCols = N;
	matrix.rows = malloc(M * sizeof(struct sparse_float_matrix_row));

	for (i = 0; i < matrix.uiRows; i++)
	{
		matrix.rows[i].uiCols = matrix.uiCols;
		matrix.rows[i].elements = malloc(matrix.rows[i].uiCols * sizeof(struct sparse_float_matrix_row_element));
		matrix.rows[i].elements[0].uiCol = 0;
		matrix.rows[i].elements[1].uiCol = 1;
		matrix.rows[i].elements[2].uiCol = 2;
	}

	i=0;
	matrix.rows[i].elements[0].data = 3.0;
	matrix.rows[i].elements[1].data = 4.0;
	matrix.rows[i].elements[2].data = 1.0;

	i++;
	matrix.rows[i].elements[0].data = 8.0;
	matrix.rows[i].elements[1].data = 1.0;
	matrix.rows[i].elements[2].data = 9.0;

	i++;
	matrix.rows[i].elements[0].data = 8.0;
	matrix.rows[i].elements[1].data = 7.0;
	matrix.rows[i].elements[2].data = 7.0;

	i++;
	matrix.rows[i].elements[0].data = 2.0;
	matrix.rows[i].elements[1].data = 2.0;
	matrix.rows[i].elements[2].data = 2.0;

	i++;
	matrix.rows[i].elements[0].data = 4.0;
	matrix.rows[i].elements[1].data = 6.0;
	matrix.rows[i].elements[2].data = 2.0;

	i++;
	matrix.rows[i].elements[0].data = 9.0;
	matrix.rows[i].elements[1].data = 1.0;
	matrix.rows[i].elements[2].data = 6.0;

	i++;
	matrix.rows[i].elements[0].data = 9.0;
	matrix.rows[i].elements[1].data = 6.0;
	matrix.rows[i].elements[2].data = 9.0;

	i++;
	matrix.rows[i].elements[0].data = 2.0;
	matrix.rows[i].elements[1].data = 4.0;
	matrix.rows[i].elements[2].data = 8.0;

	i++;
	matrix.rows[i].elements[0].data = 8.0;
	matrix.rows[i].elements[1].data = 6.0;
	matrix.rows[i].elements[2].data = 3.0;

	i++;
	matrix.rows[i].elements[0].data = 8.0;
	matrix.rows[i].elements[1].data = 2.0;
	matrix.rows[i].elements[2].data = 5.0;


	// Initialize a small (but dense) matrix for testing
	/*
	struct sparse_float_matrix matrix;
	matrix.uiRows = M;
	matrix.uiCols = N;
	matrix.rows = malloc(M * sizeof(struct sparse_float_matrix_row));
	for (i = 0; i < matrix.uiRows; i++)
	{
		matrix.rows[i].uiCols = matrix.uiCols;
		matrix.rows[i].elements = malloc(matrix.rows[i].uiCols * sizeof(struct sparse_float_matrix_row_element));
		for (j = 0; j < matrix.rows[i].uiCols; j++)
		{
			matrix.rows[i].elements[j].uiCol = j;
			matrix.rows[i].elements[j].data = i * N * 1.0 + j;
		}
	}
	*/

	print_matrix(&matrix);

	printf("PPE-only SVD, M = %d, N = %d, R = %d\n", M, N, R);  fflush(stdout);

	gettimeofday( &time0, &tzone );


	// Algorithm starts here
	do_svd(&matrix, R);
	// Algorithm ends here

	print_matrix(&matrix);

	gettimeofday( &time1, &tzone );

	sec  = time1.tv_sec  - time0.tv_sec ;
	usec = time1.tv_usec - time0.tv_usec ;
	if ( usec < 0 ) { sec--; usec+=1000000 ; }


	printf("PPE: Done matrix[%d][%d]: time=%d.%d\n",M,N,sec,usec);  fflush(stdout);

	# ifndef NEVER
	{
		int error ;
		error = 0;

		// Check results here

		if (error) { printf("Error in SVD?.\n"); fflush(stdout); }
		else { printf("SVD is correct?\n"); fflush(stdout); }
	}
	# endif

	# ifdef NEVER
		// print results here
	# endif


	return 0;
}
